PART I:

library(readxl)
# Read the marketing-campaign workbook into `df`.
# The path is specific to the author's machine; adjust it here when running elsewhere.
workbook_path <- "~/Desktop/Study/Courses/Data Analytics /Team Assignment /datasets_marketing_campaign_SF.xlsx"
df <- read_excel(path = workbook_path)
# Drop every row of `x` that contains a missing value in any column.
#   x: a data frame (or any object na.omit() accepts).
# Returns `x` restricted to its complete rows.
clean.df <- function(x) {
  complete_rows <- na.omit(x)
  complete_rows
}

df <- clean.df(df)
# Data massaging
## Binary marital status: 1 for "Together"/"Married", 0 for every other
## listed status. A single exact-match lookup replaces the former chain of
## gsub() calls, which matched substrings and scanned the column once per label.
marital_codes <- c(
  Absurd = 0, Alone = 0, Divorced = 0, Single = 0, Widow = 0, YOLO = 0,
  Together = 1, Married = 1
)
## A status that is not in the table becomes NA, so unexpected labels remain
## visible instead of being silently coded.
df$marital_binary <- unname(marital_codes[as.character(df$Marital_Status)])

## Converting Education into numeric: the higher the education, the higher
## the assigned value. Exact-match lookup instead of sequential gsub() passes;
## a label that is not in the table becomes NA.
## NOTE(review): "2n Cycle" is ranked below "Graduation" here -- confirm this
## ordering matches the meaning of the label in the data dictionary.
## The column name keeps its original spelling ("eduction_level") because
## later code and regression formulas refer to it.
education_codes <- c(
  "Basic" = 1, "2n Cycle" = 2, "Graduation" = 3, "Master" = 4, "PhD" = 5
)
df$eduction_level <- unname(education_codes[as.character(df$Education)])

## US indicator and purchase totals
# 1 when Country is exactly "US", 0 for every other country. The previous
# gsub() chain matched "US" as a substring (it had to special-case "AUS"
# first) and would have flagged any other code containing "US".
df$US <- as.numeric(df$Country == "US")

# Total purchases across the web, store and catalog channels.
df$total_sales <- df$NumWebPurchases + df$NumStorePurchases + df$NumCatalogPurchases
# Split the total by region: USSales is non-zero only on US rows,
# ROTW (rest of the world) only on non-US rows.
df$USSales <- df$US * df$total_sales
df$ROTW <- (1 - df$US) * df$total_sales

# Total purchases per region (US = 0 / 1).
sub_us <- aggregate(total_sales ~ US, data = df, FUN = sum, na.rm = TRUE)

# Year in which each customer enrolled, taken from Dt_Customer.
enrol_year <- as.numeric(format(as.Date(df$Dt_Customer), format = "%Y"))

# Age at enrolment = enrolment year minus birth year.
# age_at_purchase is assigned first so the column order of df stays the same.
df$age_at_purchase <- enrol_year - df$Year_Birth
df$Dt_Customer_Converted <- enrol_year

B. Does the US fare significantly better than the RoW (Rest of the World) in terms of total purchases?

1. Regression Output

a. Output of the regression between Total Sales (y - Response) and other explanatory variables

# df$US, df$total_sales, df$USSales, df$ROTW and sub_us are derived in the
# data-preparation section at the top of Part I, so they are not recomputed
# here. Verify they exist so the regressions below fail with a clear message
# if that section has not been run.
stopifnot(
  all(c("US", "total_sales", "USSales", "ROTW") %in% names(df)),
  exists("sub_us")
)


# Multiple linear regression of total purchases (all customers) on income,
# children at home, amounts spent per product category, deal purchases,
# acceptance of campaign 5 and Response.
# The call is left exactly as written because summary() echoes it.
world_linear <- lm(total_sales~Income+Kidhome+MntWines+MntFruits+MntMeatProducts+MntFishProducts+
                     MntSweetProducts+MntGoldProds+NumDealsPurchases+
                     AcceptedCmp5+Response, data = df)
summary(world_linear)
## 
## Call:
## lm(formula = total_sales ~ Income + Kidhome + MntWines + MntFruits + 
##     MntMeatProducts + MntFishProducts + MntSweetProducts + MntGoldProds + 
##     NumDealsPurchases + AcceptedCmp5 + Response, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.3408  -1.6577  -0.3319   1.8370  14.8419 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.912e+00  2.399e-01  20.473  < 2e-16 ***
## Income             3.965e-05  4.012e-06   9.883  < 2e-16 ***
## Kidhome           -2.406e+00  1.750e-01 -13.748  < 2e-16 ***
## MntWines           9.105e-03  3.285e-04  27.717  < 2e-16 ***
## MntFruits          1.147e-02  2.581e-03   4.446 9.20e-06 ***
## MntMeatProducts    3.946e-03  5.011e-04   7.874 5.33e-15 ***
## MntFishProducts    8.816e-03  1.945e-03   4.534 6.10e-06 ***
## MntSweetProducts   1.860e-02  2.465e-03   7.543 6.66e-14 ***
## MntGoldProds       1.476e-02  1.703e-03   8.668  < 2e-16 ***
## NumDealsPurchases  7.060e-01  4.192e-02  16.839  < 2e-16 ***
## AcceptedCmp5      -2.300e+00  3.485e-01  -6.600 5.12e-11 ***
## Response          -3.889e-01  2.238e-01  -1.738   0.0824 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.499 on 2204 degrees of freedom
## Multiple R-squared:  0.7653, Adjusted R-squared:  0.7641 
## F-statistic: 653.2 on 11 and 2204 DF,  p-value: < 2.2e-16

b. Output of the regression between Total RoTW Sales (y - Response) and other explanatory variables

# Multiple linear regression of rest-of-the-world purchases (ROTW, which is 0
# on US rows) on the same predictors as world_linear, except that the last
# term is NumWebPurchases instead of Response.
# NOTE(review): NumWebPurchases is one of the three summands of total_sales,
# from which ROTW is derived, so it is part of the response on non-US rows and
# its coefficient is close to 1 largely by construction. Confirm whether
# Response was intended here, as in world_linear.
ROTW_linear <- lm(ROTW~Income+Kidhome+MntWines+MntFruits+MntMeatProducts+MntFishProducts+
                    MntSweetProducts+MntGoldProds+NumDealsPurchases+
                    AcceptedCmp5+NumWebPurchases, data = df)
summary(ROTW_linear)
## 
## Call:
## lm(formula = ROTW ~ Income + Kidhome + MntWines + MntFruits + 
##     MntMeatProducts + MntFishProducts + MntSweetProducts + MntGoldProds + 
##     NumDealsPurchases + AcceptedCmp5 + NumWebPurchases, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -29.0552  -0.9634   0.1297   1.5645  13.6198 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        3.106e+00  2.905e-01  10.694  < 2e-16 ***
## Income             2.857e-05  4.722e-06   6.049 1.71e-09 ***
## Kidhome           -1.483e+00  2.076e-01  -7.145 1.22e-12 ***
## MntWines           5.748e-03  4.121e-04  13.948  < 2e-16 ***
## MntFruits          7.260e-03  3.035e-03   2.392 0.016825 *  
## MntMeatProducts    3.909e-03  5.883e-04   6.645 3.81e-11 ***
## MntFishProducts    7.902e-03  2.285e-03   3.458 0.000554 ***
## MntSweetProducts   1.251e-02  2.916e-03   4.290 1.86e-05 ***
## MntGoldProds       6.876e-03  2.028e-03   3.391 0.000709 ***
## NumDealsPurchases  1.842e-01  5.164e-02   3.568 0.000368 ***
## AcceptedCmp5      -1.125e+00  4.007e-01  -2.807 0.005047 ** 
## NumWebPurchases    9.973e-01  4.278e-02  23.312  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.112 on 2204 degrees of freedom
## Multiple R-squared:  0.7028, Adjusted R-squared:  0.7013 
## F-statistic: 473.8 on 11 and 2204 DF,  p-value: < 2.2e-16

c. Output of the regression between Total US Sales (y - Response) and other explanatory variables

# Linear regression of US purchases (USSales, which is 0 on non-US rows) on
# meat spending, deal purchases and Response.
# A stray unary "+" in front of Response was removed; the fitted model is
# unchanged, only the echoed call is tidier.
# NOTE(review): the model is fitted on all rows, so most responses are the
# structural zeros of non-US customers -- consider fitting on US rows only.
US_linear <- lm(USSales~MntMeatProducts+
                  NumDealsPurchases+
                  Response, data = df)
summary(US_linear)
## 
## Call:
## lm(formula = USSales ~ MntMeatProducts + NumDealsPurchases + 
##     +Response, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1170 -0.8883 -0.5482 -0.2681 28.0924 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.0799928  0.1267622   0.631   0.5281    
## MntMeatProducts    0.0017656  0.0003225   5.474 4.89e-08 ***
## NumDealsPurchases  0.1495908  0.0365244   4.096 4.36e-05 ***
## Response          -0.4290861  0.2008926  -2.136   0.0328 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.281 on 2212 degrees of freedom
## Multiple R-squared:  0.01883,    Adjusted R-squared:  0.0175 
## F-statistic: 14.15 on 3 and 2212 DF,  p-value: 3.878e-09

Details of how you computed Rest of the World and Total purchases

Total purchases (total_sales) are the sum of web, store and catalog purchases (NumWebPurchases + NumStorePurchases + NumCatalogPurchases). Rest-of-the-world purchases (ROTW) equal total_sales for customers outside the US and 0 for US customers; USSales is the reverse. The total-sales regression has an adjusted R-squared of 76%. Kidhome and AcceptedCmp5 have a negative relation with total sales. In the RoTW model every product category has a positive coefficient; MntFruits is the least significant of them (p ≈ 0.017), while MntWines, MntMeatProducts, MntFishProducts, MntSweetProducts and MntGoldProds are all significant at the 0.1% level. NumDealsPurchases, with an estimate of 0.71, has a strong impact on total sales. In the RoTW model its estimate is smaller (0.18) but still significant, and with an adjusted R-squared of 70% the model indicates that both RoTW and total sales are related to NumDealsPurchases.

Analysis of the results

The RoTW (rest of the world) regression has an R-squared of 70%, i.e. the explanatory variables account for about 70% of the variation in RoTW purchases. Kidhome and AcceptedCmp5 have negative coefficients: each additional child at home is associated with about 1.48 fewer RoTW purchases. NumDealsPurchases is positively related to RoTW purchases, with an estimated increase of 0.18 purchases per deal purchase. Because of this relation, the CMO should offer deals to customers to increase purchases. The US model, by contrast, has an adjusted R-squared of only 1.75%, so it explains very little of the variation in US purchases; its NumDealsPurchases estimate is positive (0.15 purchases per deal purchase).

Question C. Your supervisor insists that people who buy gold are more conservative and as such people who spent an above average amount on gold in last 2 years would have more in store purchases. Justify/refute the statement using appropriate statistical tests.

# H0 : mu1 - mu2 <= 0
# Ha : mu1 - mu2 > 0
# mu1 / mu2: mean NumStorePurchases of customers whose gold spending is
# above / not above the sample mean.

# Mean amount spent on gold
gold_mean <- mean(df$MntGoldProds)
df$date <- as.Date(df$Dt_Customer)

# 1 when the customer's gold spending is above the mean, 0 otherwise.
# Vectorised comparison instead of a row-by-row loop that wrote into a column
# which did not exist yet and then needed a character-to-numeric conversion.
df$gold_binary <- as.numeric(df$MntGoldProds > gold_mean)

# Interaction between number of store purchases and gold_binary
df$store_x_gold <- df$NumStorePurchases * df$gold_binary

# Two samples: customers with above-mean gold spending ...
df_gold <- df[which(df$gold_binary == 1), ]
# ... and everyone else.
df_gold0 <- df[which(df$gold_binary != 1), ]

# Statistical calculation
# Sample 1: customers with above-mean gold spending (df_gold).
# Sample 2: all other customers (df_gold0).
# mean*/s*/n* are the mean, standard deviation and size of NumStorePurchases
# in each sample; they feed the summary table further down.
mean1 <- mean(df_gold$NumStorePurchases)
s1 <- sd(df_gold$NumStorePurchases)
n1 <- nrow(df_gold)

mean2 <- mean(df_gold0$NumStorePurchases)
s2 <- sd(df_gold0$NumStorePurchases)
n2 <- nrow(df_gold0)

# One-sided Welch two-sample t-test (unequal variances). t.test() computes the
# same statistic and Welch-Satterthwaite degrees of freedom that were
# previously written out by hand, and also provides the p-value the decision
# should rest on.
store_gold_test <- t.test(
  df_gold$NumStorePurchases, df_gold0$NumStorePurchases,
  alternative = "greater", var.equal = FALSE
)
# `t` and `deg_f` keep their original names; `t` as a variable does not stop
# calls to the base function t() from working.
t <- unname(store_gold_test$statistic)
deg_f <- unname(store_gold_test$parameter)
p_value <- store_gold_test$p.value
## --> Conclusion: with a DoF of about 1326 and a t-statistic of about 21, we can reject H0.

## Confounding variable: Income, since people with higher income tend to spend
## more on both in-store purchases and gold purchases.
# Scatter plot of NumStorePurchases against MntGoldProds, coloured by group,
# with one fitted regression line per group.

library(plotly)
# Legend labels for the two groups (spelling of "Conservative" corrected).
df$personality_type <- ifelse(df$gold_binary == 1, "Conservative", "Less Conservative")

org_scatter <- ggplot(
  data = df,
  aes(x = MntGoldProds, y = NumStorePurchases, color = personality_type)
) +
  geom_point() +
  geom_smooth(method = "lm")
#ggplotly(org_scatter)

# Summary table of the statistics that feed the two-sample t-test.
test_values <- c(mean1, s1, n1, mean2, s2, n2, deg_f)

names_test <- c("Mean 1", "Standard Deviation 1", "Sample Size 1", "Mean 2", "Standard Deviation 2", "Sample Size 2", "Degrees of Freedom")

df_qc <- data.frame(Statistics = names_test, Values = test_values)

The table below lists the statistics derived from our two samples, which are needed for the two-sample t-test for a difference in means.

##             Statistics      Values
## 1               Mean 1    7.767575
## 2 Standard Deviation 1    3.008214
## 3        Sample Size 1  697.000000
## 4               Mean 2    4.898618
## 5 Standard Deviation 2    2.946200
## 6        Sample Size 2 1519.000000
## 7   Degrees of Freedom 1325.738994

According to our test, with 1326 degrees of freedom and a t-statistic of 20.98, we reject the null hypothesis at the 5% significance level: customers who spent an above-average amount on gold make more in-store purchases on average.

Furthermore, the multiple regression analysis points to similar results, which further supports our previous findings.

## `geom_smooth()` using formula 'y ~ x'

Question E Do any other analysis you deem relevant to show to your CMO. For the purpose, propose a hypothesis and perform the appropriate tests.

 Hypothesis A: Customers who like to purchase gold products tend to look at gold price charts on the website and are therefore more likely to buy products online.

 Hypothesis B: Wealthier customers like to enjoy their days off with wine and meat.

 Hypothesis C: More educated customers prefer to spend on sweets.

# H0: mean web purchases of above-mean gold buyers <= mean web purchases of the other customers
# Ha: mean web purchases of above-mean gold buyers >  mean web purchases of the other customers
mu1_hypA <- mean(df_gold$NumWebPurchases)
s1_hypA <- sd(df_gold$NumWebPurchases)
n1_hypA <- nrow(df_gold)

mu2_hypA <- mean(df_gold0$NumWebPurchases)
s2_hypA <- sd(df_gold0$NumWebPurchases)
n2_hypA <- nrow(df_gold0)

# One-sided Welch two-sample t-test (unequal variances). t.test() yields the
# same statistic and degrees of freedom as the former hand-written formulas,
# plus the p-value.
test_hypA <- t.test(
  df_gold$NumWebPurchases, df_gold0$NumWebPurchases,
  alternative = "greater", var.equal = FALSE
)
t_hypeA <- unname(test_hypA$statistic)
deg_f_hypeA <- unname(test_hypA$parameter)
p_value_hypA <- test_hypA$p.value

According to our test, with 1357 degrees of freedom and a t-statistic of 19.67, we reject the null hypothesis at the 5% significance level: customers with above-average gold spending make more online purchases on average.

# H0: mean amount spent on meat and wine by the rich customers <= that of the poor customers
# Ha: mean amount spent on meat and wine by the rich customers >  that of the poor customers
# "Rich" = income above the sample mean; "poor" = income at or below it.

mean_income <- mean(df$Income)

# "1"/"0" flag, kept as character (the type the former loop produced).
# The vectorised form avoids the row loop that indexed a column which did not
# exist yet and raised "Unknown or uninitialised column: `income_binary`".
df$income_binary <- ifelse(df$Income > mean_income, "1", "0")

# Combined spending on meat and wine.
df$sum_wine_meat <- df$MntMeatProducts + df$MntWines

df_rich <- df[which(df$income_binary == "1"), ]

df_poor <- df[which(df$income_binary == "0"), ]

# H0: mean wine + meat spending of rich customers <= mean wine + meat spending of poor customers
# Ha: mean wine + meat spending of rich customers >  mean wine + meat spending of poor customers
# (The comments previously here were copied from Hypothesis A and described
# online purchases of gold buyers.)
mu1_hypB <- mean(df_rich$sum_wine_meat)
s1_hypB <- sd(df_rich$sum_wine_meat)
n1_hypB <- nrow(df_rich)

mu2_hypB <- mean(df_poor$sum_wine_meat)
s2_hypB <- sd(df_poor$sum_wine_meat)
n2_hypB <- nrow(df_poor)

# One-sided Welch two-sample t-test (unequal variances). t.test() yields the
# same statistic and degrees of freedom as the former hand-written formulas,
# plus the p-value.
test_hypB <- t.test(
  df_rich$sum_wine_meat, df_poor$sum_wine_meat,
  alternative = "greater", var.equal = FALSE
)
t_hypeB <- unname(test_hypB$statistic)
deg_f_hypeB <- unname(test_hypB$parameter)
p_value_hypB <- test_hypB$p.value

According to our test, with 1349 degrees of freedom and a t-statistic of 49.92, we reject the null hypothesis at the 5% significance level: customers with above-average income spend more on wine and meat products than customers with below-average income.

# H0: mean amount spent on sweet products by the more educated customers <= that of the less educated customers
# Ha: mean amount spent on sweet products by the more educated customers >  that of the less educated customers
# "More educated" = eduction_level above the sample mean.

mean_edu <- mean(df$eduction_level)

# "1"/"0" flag, kept as character (the type the former loop produced).
# The vectorised form avoids the row loop that indexed a column which did not
# exist yet and raised "Unknown or uninitialised column: `edu_binary`".
df$edu_binary <- ifelse(df$eduction_level > mean_edu, "1", "0")

df_smart <- df[which(df$edu_binary == "1"), ]

df_not_really_smart <- df[which(df$edu_binary == "0"), ]


# Sample statistics of MntSweetProducts for the more educated (df_smart) and
# the less educated (df_not_really_smart) customers.
mu1_hypC <- mean(df_smart$MntSweetProducts)
s1_hypC <- sd(df_smart$MntSweetProducts)
n1_hypC <- nrow(df_smart)

mu2_hypC <- mean(df_not_really_smart$MntSweetProducts)
s2_hypC <- sd(df_not_really_smart$MntSweetProducts)
n2_hypC <- nrow(df_not_really_smart)

# One-sided Welch two-sample t-test (unequal variances). t.test() yields the
# same statistic and degrees of freedom as the former hand-written formulas,
# plus the p-value.
test_hypC <- t.test(
  df_smart$MntSweetProducts, df_not_really_smart$MntSweetProducts,
  alternative = "greater", var.equal = FALSE
)
t_hypeC <- unname(test_hypC$statistic)
deg_f_hypeC <- unname(test_hypC$parameter)
p_value_hypC <- test_hypC$p.value

According to our test, with 2102 degrees of freedom and a t-statistic of -6.24, we cannot reject the null hypothesis at the 5% significance level. Therefore, we do not need to target promotions for sweet products at more educated customers.

PART II:

Q1. What factors seems to drive web purchases?

According to our analysis, at the 10% significance level the following factors seem to drive web purchases: income, number of kids in the household, number of teenagers in the household, amount spent on wine, meat, sweets and gold, number of purchases made with a discount, number of purchases made directly in stores, number of web visits per month, acceptance of campaign 2 and campaign 5, and the Response variable. Catalog purchases are not significant in the regression below (p ≈ 0.26).

## 
## Call:
## lm(formula = NumWebPurchases ~ Income + Kidhome + Teenhome + 
##     Recency + MntWines + MntFruits + MntMeatProducts + MntFishProducts + 
##     MntSweetProducts + MntGoldProds + NumDealsPurchases + NumCatalogPurchases + 
##     NumStorePurchases + NumWebVisitsMonth + AcceptedCmp3 + AcceptedCmp4 + 
##     AcceptedCmp5 + AcceptedCmp1 + AcceptedCmp2 + AcceptedCmp1 + 
##     Complain + Z_CostContact + Z_Revenue + Response, data = training_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5659 -0.9768 -0.1364  0.9265 22.7602 
## 
## Coefficients: (2 not defined because of singularities)
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.012e+00  2.576e-01  -3.930 8.82e-05 ***
## Income               1.071e-05  2.470e-06   4.337 1.52e-05 ***
## Kidhome             -7.297e-01  1.136e-01  -6.425 1.70e-10 ***
## Teenhome             3.491e-01  9.865e-02   3.539 0.000412 ***
## Recency              1.719e-03  1.583e-03   1.086 0.277673    
## MntWines             2.633e-03  2.352e-04  11.192  < 2e-16 ***
## MntFruits            2.112e-03  1.576e-03   1.340 0.180543    
## MntMeatProducts     -6.177e-04  3.452e-04  -1.790 0.073694 .  
## MntFishProducts      1.248e-03  1.188e-03   1.050 0.293643    
## MntSweetProducts     4.095e-03  1.531e-03   2.675 0.007553 ** 
## MntGoldProds         7.671e-03  1.040e-03   7.377 2.48e-13 ***
## NumDealsPurchases    1.866e-01  2.955e-02   6.314 3.44e-10 ***
## NumCatalogPurchases  2.973e-02  2.651e-02   1.122 0.262118    
## NumStorePurchases    1.875e-01  2.096e-02   8.945  < 2e-16 ***
## NumWebVisitsMonth    3.313e-01  2.727e-02  12.149  < 2e-16 ***
## AcceptedCmp3         1.361e-02  1.834e-01   0.074 0.940861    
## AcceptedCmp4        -1.643e-01  1.967e-01  -0.835 0.403692    
## AcceptedCmp5        -4.348e-01  2.185e-01  -1.990 0.046769 *  
## AcceptedCmp1         4.570e-02  2.087e-01   0.219 0.826713    
## AcceptedCmp2        -1.609e+00  3.997e-01  -4.025 5.94e-05 ***
## Complain             3.629e-02  4.608e-01   0.079 0.937238    
## Z_CostContact               NA         NA      NA       NA    
## Z_Revenue                   NA         NA      NA       NA    
## Response             3.177e-01  1.476e-01   2.153 0.031477 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.884 on 1750 degrees of freedom
## Multiple R-squared:  0.5158, Adjusted R-squared:   0.51 
## F-statistic: 88.78 on 21 and 1750 DF,  p-value: < 2.2e-16

Q2. Is there a relation between web visits and web purchases?

The scatter plot between numbers of web purchases and number of monthly web visits indicates a moderate relation between these two variables.

## 
##  Pearson's Chi-squared test
## 
## data:  df$NumWebPurchases and df$NumWebVisitsMonth
## X-squared = 1249.8, df = 210, p-value < 2.2e-16

Furthermore, according to the chi-squared test, there is a relationship between the number of web purchases and the number of web visits per month.

Q3. Is there a relation between geographical region and the success of a campaign?

## [1] 15.15094

Observed Frequencies Table

##   Country AcceptedCmp1 AcceptedCmp2 AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1     AUS            7            0            9            6           12
## 2      CA           18            6           18           24           21
## 3     GER            7            2           10           11            8
## 4     IND            7            2           13           10            6
## 5      ME            0            0            1            0            0
## 6      SA           20            4           21           20           21
## 7      SP           76           16           83           87           89
## 8      US            7            0            8            6            5

Expected Frequencies Table

##   Country AcceptedCmp1 AcceptedCmp2 AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1     AUS     7.304085   1.54311649    8.3842663    8.4357035    8.3328290
## 2      CA    18.689864   3.94856278   21.4538578   21.5854766   21.3222390
## 3     GER     8.163389   1.72465961    9.3706505    9.4281392    9.3131619
## 4     IND     8.163389   1.72465961    9.3706505    9.4281392    9.3131619
## 5      ME     0.214826   0.04538578    0.2465961    0.2481089    0.2450832
## 6      SA    18.475038   3.90317700   21.2072617   21.3373676   21.0771558
## 7      SP    75.403933  15.93040847   86.5552194   87.0862330   86.0242057
## 8      US     5.585477   1.18003026    6.4114977    6.4508321    6.3721634

According to our test, we cannot reject the null hypothesis: the chi-squared statistic is 15.1509352 with 28 degrees of freedom, which is below the critical value of 41.337 at the 5% significance level. Therefore, we find no evidence that the success of a campaign depends on the geographical region.

Plotting

Q4. What is the average amount spent on fruits?

According to our data, the average amount spent on fruits is 26.3560469.

Q5. Is the variation in the amount spent on fish and meat affected by any qualitative factors? If yes, which ones?

## 
## Call:
## lm(formula = MntMeatProducts + MntFishProducts ~ age_at_purchase + 
##     eduction_level + marital_binary + US + AcceptedCmp1 + AcceptedCmp2 + 
##     AcceptedCmp3 + AcceptedCmp4 + AcceptedCmp5 + Complain + Response, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -631.66 -147.94  -92.94   84.83 1569.31 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      137.4115    24.7592   5.550 3.20e-08 ***
## age_at_purchase    0.8385     0.4234   1.980  0.04781 *  
## eduction_level    -0.4946     5.0829  -0.097  0.92249    
## marital_binary    -8.4262    10.5606  -0.798  0.42502    
## US                31.5675    23.2469   1.358  0.17463    
## AcceptedCmp1     219.4270    22.9106   9.578  < 2e-16 ***
## AcceptedCmp2    -127.5156    45.8293  -2.782  0.00544 ** 
## AcceptedCmp3     -53.2672    19.9562  -2.669  0.00766 ** 
## AcceptedCmp4     -66.0198    21.0636  -3.134  0.00175 ** 
## AcceptedCmp5     285.2607    22.1809  12.861  < 2e-16 ***
## Complain         -52.3357    51.4492  -1.017  0.30916    
## Response          79.1251    15.8195   5.002 6.13e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 234 on 2204 degrees of freedom
## Multiple R-squared:  0.1915, Adjusted R-squared:  0.1874 
## F-statistic: 47.44 on 11 and 2204 DF,  p-value: < 2.2e-16

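According to our regression analysis, the significant qualitative factors are the acceptance of each of the five campaigns (AcceptedCmp1 to AcceptedCmp5) and the Response variable. Age at purchase is also significant at the 5% level, although it is a quantitative rather than a qualitative variable. Education level, marital status, the US indicator and Complain are not significant.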

Correlation table on all variables

Relatively high correlated factors:

Q6. Fish has Omega 3 fatty acids, good for brain, accordingly, do people with advanced degrees purchase more fish than others?

According to our data, the customers who spend the most on fish are those with the 2n Cycle education level, at an average of 48.04.

Q7. Teenagers are fussy on food, which foods do families with teenagers spend most on?

##   MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1     20159          110127           27418            21414

According to our data, households with teenagers spend the most on meat products, at 110127.

Q8. Which marketing campaign is most successful?

According to our data, Campaign 4 is the most accepted campaign with 164 responses accepted; however, both Campaign 3 and 5 are right behind with 163 and 162 accepted responses, respectively.

Q9. What kind of people seem to be complaining most?

According to the collected sample, there are 21 complaints out of 2216 observations. Of the 21 complaining customers:

  1. 67 % are customers with Graduation level.
  2. 62 % are customers with married or together status.
  3. 80.95 % are customers with children at home.
  4. 66.67 % of the complaints are from Spain.

Q10. What is the average age of customers given in the sample?

The average age of customers at the dates of their purchases is 44 years old.

The average current age of customers is 52 years old.